/**
 * HDF5 import/export for DataFrameTyped.
 *
 * Frames are stored as rank-1 datasets of an HDF5 compound type with one member
 * per column; plain rank-2 float arrays can also be loaded.
 */
module dataframe.hdf5;
import dataframe.common;
import dataframe.typed;
import hdf5.hdf5;
import dataframe.hdf5util;
import std.conv;
import std.csv;
import std.datetime;
import std.exception;
import std.range:array, stride,only;
import std.stdio;
import std.variant;
import std.string:isNumeric;
alias KalVariant=Algebraic!(string,int,long, DateTime, float,double);
import std.typecons:tuple,Tuple;
static import std.traits;

/// Chunk length (in rows) used when creating chunked datasets in toHDF5.
enum CHUNKSIZE=260;

/// Layout of a dataset's element type: per-column title, type, byte offset and
/// byte size, plus the total byte size of one element.
alias DataTypes=Tuple!(string[],"columnTitles", ColumnType[],"columnTypes",int[],"offsets",int[],"sizes",int,"totalSize");


/**
 * Builds an HDF5 compound datatype with one member per column of `frame`.
 *
 * Members are inserted in `frame.columnTitles` order at consecutive byte
 * offsets (no padding), which is the layout toBytes produces.
 *
 * Params:
 *   frame = frame whose column titles and types define the members
 *   name  = unused by the current implementation
 * Returns: the new datatype id; the caller is responsible for closing it.
 * Throws: Exception (from toH5Type / columnSizeOf) for a column type that has
 *         no mapping.
 */
hid_t createDataType(DataFrameTyped frame, string name="")
{
    auto tid=H5T.create(H5TClass.Compound,frame.columnSizeOf);
    long offset=0L;
    foreach(colTitle;frame.columnTitles)
    {
        H5T.insert(tid,colTitle,offset,frame.columnTypes[colTitle].toH5Type);
        offset+=frame.columnTypes[colTitle].columnSizeOf;
    }
    return tid;
}

/// Sum of the byte sizes of the given column types.
size_t columnSizeOf(ColumnType[] types)
{
    size_t ret;
    foreach(type;types)
        ret+=type.columnSizeOf;
    return ret;
}

/// Byte size of one packed row of `frame` (sum over all of its columns).
size_t columnSizeOf(DataFrameTyped frame)
{
    size_t ret;
    foreach(title;frame.columnTitles)
        ret+=frame.columnTypes[title].columnSizeOf;
    return ret;
}

/**
 * Byte size of a single cell of the given column type.
 *
 * Throws: Exception for a column type not listed in the switch.
 */
size_t columnSizeOf(ColumnType type)
{
    switch(type) with(ColumnType)
    {
        case Int:
            return int.sizeof;
        case Long:
            return long.sizeof;
        case Double:
            return double.sizeof;
        case Date:
            return std.datetime.Date.sizeof;
        case DateTime:
            return std.datetime.DateTime.sizeof;
        default:
            throw new Exception("unknown type: "~type.to!string);
    }
}


/**
 * Serialises `frame` row by row into a packed byte buffer.
 *
 * Each row occupies `frame.columnSizeOf` bytes and cells follow
 * `frame.columnTitles` order. Int, Long and Double cells are copied from
 * `frame.values`; Date and DateTime cells only reserve their space and are left
 * zero-filled.
 *
 * Returns: a buffer of `frame.columnSizeOf * frame.numRows` bytes.
 */
ubyte[] toBytes(DataFrameTyped frame)
{
    auto colBytes=frame.columnSizeOf;
    ubyte[] ret = new ubyte[colBytes*frame.numRows];
    foreach(row;0..frame.numRows)
    {
        auto rowOffset=row*colBytes;
        auto cellOffset=rowOffset;
        foreach(colTitle;frame.columnTitles)
        {
            switch(frame.columnTypes[colTitle]) with(ColumnType)
            {
                case Int:
                    *(cast(int*)&ret[cellOffset])=frame.values.ints[colTitle][row];
                    cellOffset+=int.sizeof;
                    break;
                case Long:
                    *(cast(long*)&ret[cellOffset])=frame.values.longs[colTitle][row];
                    cellOffset+=long.sizeof;
                    break;
                case Double:
                    *(cast(double*)&ret[cellOffset])=frame.values.doubles[colTitle][row];
                    cellOffset+=double.sizeof;
                    break;
                case Date:
                    // space is reserved but no value is written
                    cellOffset+=std.datetime.Date.sizeof;
                    break;
                case DateTime:
                    // space is reserved but no value is written
                    cellOffset+=std.datetime.DateTime.sizeof;
                    break;
                default:
                    break;
            }
        }
    }
    return ret;
}

/**
 * Builds a frame of Double columns from a flat, row-major float array.
 *
 * Params:
 *   data         = cell values, `columnTitles.length` values per row; trailing
 *                  values that do not fill a whole row are ignored
 *   columnTitles = one title per column; must not be empty
 * Returns: a frame with `data.length / columnTitles.length` rows.
 * Throws: Exception if `columnTitles` is empty.
 */
auto dataFrameTypedFromFloats(float[] data,string[] columnTitles)
{
    DataFrameTyped ret;
    ColumnType[] columnTypes;
    auto numCols=columnTitles.length;
    // guard the division below: an empty title list would divide by zero
    enforce(numCols>0, new Exception("at least one column title is required"));
    auto numRows=data.length/numCols;
    ret.setColumnTitles(columnTitles);
    columnTypes.length=numCols;
    foreach(ref type;columnTypes)
        type=ColumnType.Double;
    ret.setColumnTypes(columnTypes);
    ret.setRows(numRows);
    foreach(row;0..numRows)
    {
        foreach(col;0..numCols)
        {
            ret[row,columnTitles[col]]=data[row*numCols+col].to!double;
        }
    }
    return ret;
}

/**
 * Inspects the element datatype of a dataset and describes its columns.
 *
 * For a compound type every member becomes one column (nested compound and
 * array members are rejected). Integer and Float element types yield a single
 * untitled column; any other class yields an empty description. Diagnostic
 * information is printed to stdout.
 *
 * Params:
 *   filename    = HDF5 file, opened read-only
 *   datasetName = dataset within the file
 * Returns: titles, column types, byte offsets, byte sizes and total element size.
 * Throws: Exception for a nested compound/array member or for a member whose
 *         class or size has no ColumnType mapping.
 */
DataTypes dataTypesForHDF5(string filename, string datasetName)
{
    string[] names;
    ColumnType[] types;
    int[] offsets,sizes;
    auto file = H5F.open(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
    scope(exit) H5F.close(file);
    auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
    scope(exit) H5D.close(dataset);
    auto s1_tid = H5D.get_type(dataset);
    scope(exit) H5T.close(s1_tid);
    switch(H5T.get_class(s1_tid)) with (H5TClass)
    {
        case Integer:
            // query the dataset's datatype handle, not the ColumnType enum value
            auto ord = H5Tget_order(s1_tid);
            auto sgn = H5Tget_sign(s1_tid);
            auto sz = H5Tget_size(s1_tid).to!int;
            writefln("Integer byte order = %s",ord); // H5TOrderLE or BE
            writefln("Integer sign = %s",sgn); // H5T SGN None or 2
            writefln("Integer size = %s",sz);
            return DataTypes([],[ColumnType.Int],[],[sz],sz);
        case Float:
            return DataTypes([],[ColumnType.Double],[],[],0);
        case Compound:
            auto sz = H5Tget_size(s1_tid).to!int;
            auto nmemb = H5Tget_nmembers(s1_tid);
            writefln(" %s bytes",sz);
            writefln(" %s members",nmemb);
            foreach(i;0..nmemb)
            {
                auto s2_tid = H5T.get_member_type(s1_tid, i);
                // each member type is a fresh handle and must be released
                scope(exit) H5T.close(s2_tid);
                enforce(H5Tget_class(s2_tid) != H5TClass.Compound);
                enforce(H5T.get_class(s2_tid) != H5TClass.Array);
                writefln(" %s: type code %s offset %s size %s",
                        H5T.get_member_name(s1_tid, i),
                        H5T.get_class(s2_tid),
                        H5T.get_member_offset(s1_tid, i),
                        H5T.get_size(s2_tid));
                names~=H5T.get_member_name(s1_tid, i);
                types~=H5T.get_class(s2_tid).h5ClassToColumnType(H5T.get_size(s2_tid).to!int);
                offsets~=H5T.get_member_offset(s1_tid, i).to!int;
                sizes~=H5T.get_size(s2_tid).to!int;
            }
            writefln("returning: %s,%s,%s,%s,%s",names,types,offsets,sizes,sz);
            stdout.flush;
            return DataTypes(names,types,offsets,sizes,sz);
        default:
            return DataTypes([],[],[],[],0);
    }
}

/**
 * Maps an HDF5 type class and byte length to a ColumnType.
 *
 * Integers of 1, 2 or 4 bytes map to Int and 8 bytes to Long; every Float
 * length maps to Double.
 *
 * Throws: Exception for an unsupported integer length or type class.
 */
ColumnType h5ClassToColumnType(H5TClass classType, int len)
{
    switch(classType) with(H5TClass)
    {
        case Integer:
            switch(len)
            {
                case 1,2,4:
                    return ColumnType.Int;
                case 8:
                    return ColumnType.Long;
                default:
                    throw new Exception("weird length: "~len.to!string);
            }
        case Float:
            return ColumnType.Double;
        default:
            throw new Exception("unknown HDF5 class: "~classType.to!string);
    }
    assert(0);
}

/**
 * Maps a ColumnType to the matching native HDF5 datatype id.
 *
 * Throws: Exception for column types other than Int, Long and Double.
 */
hid_t toH5Type(ColumnType type)
{
    switch(type) with(ColumnType)
    {
        case Int:
            return H5T_NATIVE_INT;
        case Long:
            return H5T_NATIVE_LLONG;
        case Double:
            return H5T_NATIVE_DOUBLE;
        default:
            throw new Exception("unknown type: "~ type.to!string);
    }
}

// Reads every element of a rank-1 dataset as raw bytes in the dataset's own
// datatype, stores the element count in `numRows`, and releases every HDF5
// handle it opened before returning.
private ubyte[] readRawRank1Dataset(string filename, string datasetName, out hsize_t numRows)
{
    auto file = H5F.open(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
    scope(exit) H5F.close(file);
    auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
    scope(exit) H5D.close(dataset);
    auto dataType = H5D.get_type(dataset); /* datatype handle */
    scope(exit) H5T.close(dataType);
    auto size = H5T.get_size(dataType);
    auto dataspace = H5D.get_space(dataset); /* dataspace handle */
    scope(exit) H5S.close(dataspace);
    auto rank = H5S.get_simple_extent_ndims(dataspace);
    // validate before the extent is written into the fixed-size buffer below
    enforce(rank==1,
            new Exception("only handle vector ie rank 1 tables currently and rank="~to!string(rank)));
    hsize_t[2] dims_out;
    H5S.get_simple_extent_dims(dataspace, dims_out);
    writefln("dims=%s",dims_out);
    writefln("size=%s",size);
    writefln("total=%s",size*dims_out[0]);
    stdout.flush;
    auto data = new ubyte[dims_out[0]*size];
    H5D.read(dataset, dataType, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.ptr);
    numRows=dims_out[0];
    return data;
}

/**
 * Loads a rank-1 dataset into a DataFrameTyped.
 *
 * The raw elements are read first, then decoded column by column using the
 * layout reported by dataTypesForHDF5. Integer members of 1, 2 and 4 bytes go
 * to the frame's int columns and 8-byte ones to its long columns; 4- and 8-byte
 * floats go to its double columns. Members of any other type or size are
 * reported on stdout and skipped.
 *
 * Params:
 *   filename    = HDF5 file, opened read-only
 *   datasetName = dataset within the file
 * Throws: Exception if the dataset is not rank 1.
 */
DataFrameTyped dataFrameTypedFromHDF5DataSet(string filename,string datasetName)
{
    hsize_t numRows;
    auto data = readRawRank1Dataset(filename,datasetName,numRows);
    DataFrameTyped ret;
    auto meta=dataTypesForHDF5(filename,datasetName);
    ret.setColumnTitles(meta.columnTitles);
    ret.setColumnTypes(meta.columnTypes);
    foreach(row;0..numRows)
    {
        auto rowOffset=meta.totalSize*row;
        int j=0;
        foreach(colTitle;ret.columnTitles)
        {
            auto cellOffset=rowOffset+meta.offsets[j];
            switch(ret.columnTypes[colTitle])
            {
                case ColumnType.Int,ColumnType.Long:
                    // NOTE(review): 1- and 2-byte values are read as unsigned while
                    // 4- and 8-byte values are read as signed - confirm this is intended
                    switch(meta.sizes[j])
                    {
                        case 1:
                            ret.values.ints[colTitle]~=(*(cast(char*)(&data[cellOffset]))).to!int;
                            break;
                        case 2:
                            ret.values.ints[colTitle]~=(*(cast(ushort*)(&data[cellOffset]))).to!int;
                            break;
                        case 4:
                            ret.values.ints[colTitle]~=(*(cast(int*)(&data[cellOffset])));
                            break;
                        case 8:
                            ret.values.longs[colTitle]~=*cast(long*)(&data[cellOffset]);
                            break;
                        default:
                            writefln("skipping unknown field len: %s",colTitle);
                            break;
                    }
                    break;
                case ColumnType.Double:
                    switch(meta.sizes[j])
                    {
                        case 4:
                            ret.values.doubles[colTitle]~=(*cast(float*)&data[cellOffset]).to!double;
                            break;
                        case 8:
                            ret.values.doubles[colTitle]~=*cast(double*)&data[cellOffset];
                            break;
                        default:
                            writefln("skipping unknown field len: %s",meta.sizes[j]);
                            break;
                    }
                    break;
                default:
                    writefln("skipping %s",ret.columnTypes[colTitle]);
                    break;
            }
            ++j;
        }
        ++ret.numRows;
    }
    return ret;
}


/**
 * Writes `frame` to a dataset of an HDF5 file.
 *
 * The file is opened read/write if it exists and created through
 * friendlyH5Create otherwise. If the dataset already exists:
 *   - DumpMode.append extends it and writes the rows after the existing ones;
 *   - DumpMode.truncate resizes it to the frame and overwrites from row 0;
 *   - DumpMode.unlink deletes the link and falls through to creation.
 * If the dataset does not exist (or was unlinked) a chunked dataset of the
 * frame's compound type is created and filled.
 *
 * Params:
 *   frame       = data to write
 *   filename    = target HDF5 file
 *   datasetName = dataset within the file
 *   mode        = what to do when the dataset already exists
 *   extensible  = when creating, allow the dataset to grow without limit
 * Returns: `frame`, unchanged.
 * Throws: Exception if an existing dataset is not rank 1, or if `mode` is not
 *         append, truncate or unlink while the dataset exists.
 */
DataFrameTyped toHDF5(DataFrameTyped frame, string filename, string datasetName, DumpMode mode=DumpMode.append,
        bool extensible=true)
{
    import std.file:exists;
    hid_t file;
    bool fileExists=filename.exists;
    if (fileExists)
        file=H5F.open(filename,H5F_ACC_RDWR, H5P_DEFAULT);
    else
        file = friendlyH5Create(filename,100*1024*1024,true);
    // registered first so the file is released after every object inside it
    scope(exit) H5F.close(file);

    hsize_t[1] chunk_dims =[CHUNKSIZE];
    auto dataType = frame.createDataType;
    scope(exit) H5T.close(dataType);
    // zeroed element used as the fill value of a newly created dataset
    ubyte[] junk;
    junk.length=H5T.get_size(dataType);
    writefln("%s data set length", junk.length);
    auto data=frame.toBytes;
    writefln("%s data set bytes", data.length);
    hsize_t[] dim = [frame.numRows];
    if ((H5L.exists(file,datasetName,H5P_DEFAULT))) // does file contain our dataset
    {
        auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
        scope(exit) H5D.close(dataset);
        if ((mode==DumpMode.append) || (mode==DumpMode.truncate))
        {
            // we should check here that it is an extensible dataset
            hsize_t[1] dims_out, offset;
            {
                auto dataspace = H5D.get_space(dataset); /* dataspace handle */
                scope(exit) H5S.close(dataspace);
                auto rank = H5S.get_simple_extent_ndims(dataspace);
                // dims_out holds a single extent, so anything but rank 1 would overrun it
                enforce(rank==1,
                        new Exception("only handle vector ie rank 1 tables currently and rank="~to!string(rank)));
                H5S.get_simple_extent_dims(dataspace, dims_out);
            }
            switch(mode)
            {
                case DumpMode.append:
                    dim=[dims_out[0]+frame.numRows];
                    offset[0] = dims_out[0];
                    break;
                case DumpMode.truncate:
                    dim=[frame.numRows];
                    offset[0]=0;
                    break;
                default:
                    assert(0);
            }
            H5D.set_extent(dataset, dim);
            // re-query the file dataspace: it must reflect the new extent
            auto filespace = H5D.get_space(dataset);
            scope(exit) H5S.close(filespace);
            auto dim2=[frame.numRows];
            H5S.select_hyperslab(filespace, H5SSeloper.Set, offset, dim2);
            auto dataspace2 = H5S.create_simple(dim2);
            scope(exit) H5S.close(dataspace2);
            H5D.write(dataset, dataType, dataspace2, filespace, H5P_DEFAULT, cast(ubyte*)data.ptr);
            return frame;
        }
        else // need to destroy dataset but keep others in this file
        {
            enforce(mode==DumpMode.unlink);
            H5L.h5delete(file,datasetName,H5P_DEFAULT);
        }
    }

    hsize_t[1] maxdims = extensible?[H5S_UNLIMITED]:[frame.numRows];

    // Modify dataset creation properties, i.e. enable chunking.
    // NOTE(review): cparms is never released - close it once the property-list
    // close call exposed by the wrapper has been confirmed.
    auto cparms = H5P.create(H5P_DATASET_CREATE);
    H5P.set_chunk( cparms, chunk_dims);
    auto dataspace = H5S.create_simple(dim, maxdims);
    scope(exit) H5S.close(dataspace);
    debug writefln("* h5s simple created");
    stdout.flush;
    // pass the element bytes themselves; `&junk` would be the address of the
    // slice descriptor (pointer + length), not of the fill value
    H5P.set_fill_value (cparms, dataType, cast(void*)junk.ptr);
    debug writefln("* creating dataset");
    auto dataset = H5D.create2(file, datasetName, dataType, dataspace, H5P_DEFAULT, cparms, H5P_DEFAULT);
    scope(exit) H5D.close(dataset);
    debug writefln("* dataset created");
    auto filespace = H5D.get_space(dataset);
    scope(exit) H5S.close(filespace);
    debug writefln("* writing data");
    H5D.write(dataset, dataType, dataspace,filespace, H5P_DEFAULT, cast(ubyte*)data.ptr);
    debug writefln("* finished writing data");
    return frame;
}


/**
 * Loads a rank-2 float dataset (rows x columns) into a frame of Double columns.
 *
 * Params:
 *   filename     = existing HDF5 file
 *   groupName    = group that contains the dataset
 *   ticker       = dataset name within the group
 *   columnTitles = one title per dataset column
 * Throws: Exception if the file does not exist, the dataset is not rank 2, or
 *         its column count differs from `columnTitles.length`.
 */
auto dataFrameTypedFromSimpleHDF5Array(string filename, string groupName, string ticker, string[] columnTitles)
{
    import std.stdio:writef,writefln;
    import std.file:exists;
    hsize_t[2] dims;
    float[] data;
    H5open();
    H5_init_library();
    enforce(exists(filename),new Exception(filename~" does not exist!"));
    auto file=H5F.open(filename,H5F_ACC_RDWR, H5P_DEFAULT);
    scope(exit) H5F.close(file);
    auto groupID = H5G.open2(file, groupName, H5P_DEFAULT);
    scope(exit) H5G.close(groupID);
    writefln("GT=%s/%s",groupName,ticker);
    stdout.flush;
    auto dataset = H5D.open2(groupID, ticker, H5P_DEFAULT);
    scope(exit) H5D.close(dataset);
    auto dataspace = H5D.get_space(dataset); /* dataspace handle */
    scope(exit) H5S.close(dataspace);
    auto rank = H5S.get_simple_extent_ndims(dataspace);
    // dims holds exactly two extents, so reject any other rank before filling it
    enforce(rank==2,
            new Exception("only handle rank 2 arrays currently and rank="~to!string(rank)));
    H5S.get_simple_extent_dims(dataspace, dims);
    enforce(dims[1]==columnTitles.length);
    writefln("dims = %s,rows= %s, columnTitles = %s",dims[1],dims[0],columnTitles.length);
    data.length=dims[0]*dims[1];
    H5D.read(dataset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, cast(ubyte*)data.ptr);
    return data.dataFrameTypedFromFloats(columnTitles);
}